{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Follow these:\n",
    "* https://medium.com/datactw/imdb-dataset-visualization-data-analytics-using-pandas-97b5c6f03c6d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "pip install -q --upgrade pip\n",
    "pip install -q pandas==0.23.0\n",
    "pip install -q numpy==1.14.3\n",
    "pip install -q matplotlib==3.0.3\n",
    "pip install -q seaborn==0.8.1\n",
    "pip install -q PyAthena==1.8.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports & Settings\n",
    "\n",
    "import boto3\n",
    "import sagemaker\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "# Get region \n",
    "session = boto3.session.Session()\n",
    "region_name = session.region_name\n",
    "\n",
    "# Get SageMaker session & default S3 bucket\n",
    "sagemaker_session = sagemaker.Session()\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "\n",
    "# Set Athena database & table \n",
    "database_name = 'dsoaws'\n",
    "table_name = 'amazon_reviews_parquet'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PyAthena imports\n",
    "from pyathena import connect\n",
    "from pyathena.pandas_cursor import PandasCursor\n",
    "from pyathena.util import as_pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "7. TODO:  What is the correlation between the number of reviews and the average rating for each product?  (Answer:  0.029)\n",
    "```\n",
    "SELECT product_id, COUNT(*) AS count_reviews, AVG(star_rating) AS avg_star_rating\n",
    "FROM {}.{} \n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "GROUP BY product_id\n",
    "LIMIT 1000\n",
    "```\n",
    "8. TODO:  What is the correlation between the number of helpful votes and the rating for each review?  (-0.00926)\n",
    "```\n",
    "SELECT helpful_votes, star_rating\n",
    "FROM {}.{}\n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "LIMIT 1000\n",
    "```\n",
    "9. TODO:  What is the correlation between the length of a review and the rating for each review? (-0.1662)\n",
    "```\n",
    "SELECT product_id, LENGTH(review_body) as length_review_body, star_rating\n",
    "FROM {}.{}\n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "LIMIT 1000\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
    "s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_style = 'seaborn-whitegrid'\n",
    "\n",
    "sns.set(rc={\"font.style\":\"normal\",\n",
    "            \"axes.facecolor\":\"white\",\n",
    "            'grid.color': '.8',\n",
    "             'grid.linestyle': '-',\n",
    "            \"figure.facecolor\":\"white\",\n",
    "            \"figure.titlesize\":20,\n",
    "            \"text.color\":\"black\",\n",
    "            \"xtick.color\":\"black\",\n",
    "            \"ytick.color\":\"black\",\n",
    "            \"axes.labelcolor\":\"black\",\n",
    "            \"axes.grid\":True,\n",
    "            'axes.labelsize':10,\n",
    "           'figure.figsize':(10.0, 10.0),\n",
    "            'xtick.labelsize':10,\n",
    "            'font.size':10,\n",
    "            'ytick.labelsize':10})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper code to display values on bars\n",
    "\n",
    "def show_values_barplot(axs, space):\n",
    "    def _show_on_plot(ax):\n",
    "        for p in ax.patches:\n",
    "            _x = p.get_x() + p.get_width() + float(space)\n",
    "            _y = p.get_y() + p.get_height()\n",
    "            value = round(float(p.get_width()),2)\n",
    "            ax.text(_x, _y, value, ha=\"left\")\n",
    "\n",
    "    if isinstance(axs, np.ndarray):\n",
    "        for idx, ax in np.ndenumerate(axs):\n",
    "            _show_on_plot(ax)\n",
    "    else:\n",
    "        _show_on_plot(axs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# At the product level, find the correlation between `count_reviews` and `avg_star_rating`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_id, COUNT(*) AS count_reviews, AVG(star_rating) AS avg_star_rating\n",
    "FROM {}.{} \n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "GROUP BY product_id\n",
    "LIMIT 1000\n",
    "\"\"\".format(database_name, table_name)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "cursor.execute(statement)\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df = as_pandas(cursor)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df.corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# At the review level, find the correlation between `helpful_votes` and `star_rating`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT helpful_votes, star_rating\n",
    "FROM {}.{}\n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "LIMIT 1000\n",
    "\"\"\".format(database_name, table_name)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "cursor.execute(statement)\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df = as_pandas(cursor)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df.corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# At the review level, find the correlation between `length(review_body)` and `star_rating`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_id, LENGTH(review_body) as length_review_body, star_rating\n",
    "FROM {}.{}\n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "LIMIT 1000\n",
    "\"\"\".format(database_name, table_name)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "cursor.execute(statement)\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df = as_pandas(cursor)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df.corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TODO:  Histogram of LENGTH(review_body)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT LENGTH(review_body) as review_body_length\n",
    "FROM {}.{}\n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "LIMIT 10000\n",
    "\"\"\".format(database_name, table_name)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "cursor.execute(statement)\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df = as_pandas(cursor)\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "# x-axis is the review_body_length\n",
    "# y-axis is teh number of reviews\n",
    "matplotlib.pyplot.hist(df['review_body_length'], bins=10,\n",
    "                       range=[0,200],\n",
    "                       density=False,\n",
    "                       weights=None,\n",
    "                       cumulative=False,\n",
    "                       bottom=None,\n",
    "                       histtype='bar',\n",
    "                       align='mid',\n",
    "                       orientation='vertical',\n",
    "                       rwidth=None,\n",
    "                       log=False,\n",
    "                       color=None,\n",
    "                       label=None,\n",
    "                       stacked=False\n",
    "                      )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TODO:  Possibly use this HISTOGRAM() function in Presto - or equivalent (ie. WIDTH_BUCKET, etc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT HISTOGRAM(LENGTH(review_body)) as review_body_length_histo\n",
    "FROM {}.{}\n",
    "TABLESAMPLE BERNOULLI(10)\n",
    "LIMIT 10000\n",
    "\"\"\".format(database_name, table_name)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "cursor.execute(statement)\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df = as_pandas(cursor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "histo_dict = eval(df.values[0][0].replace('=', ':'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_histo = pd.DataFrame.from_dict(histo_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
